Business Scenario • The data provided is from a Personal Loans Campaign executed by MyBank. • 20000 customers were targeted with an offer of Personal Loans at a 10% interest rate. • 2512 of the 20000 customers responded, expressing their need for a Personal Loan; these customers are labelled Target = 1 and the remaining customers are labelled Target = 0.
The goal of this project is to build a machine learning model that predicts the TARGET variable.
# Import the campaign dataset.
# stringsAsFactors = TRUE is set explicitly: since R 4.0.0 read.csv() reads
# text columns as character by default, whereas the factor-level counts that
# summary() reports for columns such as GENDER and OCCUPATION require factors.
raw_data <- read.csv(
  "Personal Loan Campaign-dataset.csv",
  header = TRUE,
  stringsAsFactors = TRUE
)
# Show the first six rows as a quick sanity check of the import.
print(head(raw_data))
## CUST_ID TARGET AGE GENDER BALANCE OCCUPATION AGE_BKT SCR
## 1 C7927 0 27 M 3383.75 SELF-EMP 26-30 776
## 2 C6877 0 47 M 287489.04 SAL 46-50 324
## 3 C19922 0 40 M 18216.88 SELF-EMP 36-40 603
## 4 C8183 0 53 M 71720.48 SAL >50 196
## 5 C12123 0 36 M 1671622.89 PROF 36-40 167
## 6 C257 0 42 F 521685.69 PROF 41-45 493
## HOLDING_PERIOD ACC_TYPE ACC_OP_DATE LEN_OF_RLTN_IN_MNTH NO_OF_L_CR_TXNS
## 1 30 SA 3/23/2005 146 7
## 2 28 SA 10-11-08 104 8
## 3 2 SA 4/26/2012 61 10
## 4 13 CA 07-04-08 107 36
## 5 24 SA 12/29/2001 185 20
## 6 26 SA 06-07-01 192 5
## NO_OF_L_DR_TXNS TOT_NO_OF_L_TXNS NO_OF_BR_CSH_WDL_DR_TXNS
## 1 3 10 0
## 2 2 10 0
## 3 5 15 1
## 4 14 50 4
## 5 1 21 1
## 6 2 7 1
## NO_OF_ATM_DR_TXNS NO_OF_NET_DR_TXNS NO_OF_MOB_DR_TXNS NO_OF_CHQ_DR_TXNS
## 1 1 2 0 0
## 2 1 1 0 0
## 3 1 1 0 2
## 4 2 3 1 4
## 5 0 0 0 0
## 6 1 0 0 0
## FLG_HAS_CC AMT_ATM_DR AMT_BR_CSH_WDL_DR AMT_CHQ_DR AMT_NET_DR AMT_MOB_DR
## 1 0 13100 0 0 973557 0
## 2 0 6600 0 0 799813 0
## 3 0 11200 561120 49320 997570 0
## 4 0 26100 673590 60780 741506 71388
## 5 0 0 808480 0 0 0
## 6 1 18500 379310 0 0 0
## AMT_L_DR FLG_HAS_ANY_CHGS AMT_OTH_BK_ATM_USG_CHGS AMT_MIN_BAL_NMC_CHGS
## 1 986657 0 0 0
## 2 806413 1 0 0
## 3 1619210 1 0 0
## 4 1573364 0 0 0
## 5 808480 0 0 0
## 6 397810 0 0 0
## NO_OF_IW_CHQ_BNC_TXNS NO_OF_OW_CHQ_BNC_TXNS AVG_AMT_PER_ATM_TXN
## 1 0 0 13100
## 2 0 0 6600
## 3 0 1 11200
## 4 0 0 13050
## 5 0 0 0
## 6 0 0 18500
## AVG_AMT_PER_CSH_WDL_TXN AVG_AMT_PER_CHQ_TXN AVG_AMT_PER_NET_TXN
## 1 0.0 0 486778.5
## 2 0.0 0 799813.0
## 3 561120.0 24660 997570.0
## 4 168397.5 15195 247168.7
## 5 808480.0 0 0.0
## 6 379310.0 0 0.0
## AVG_AMT_PER_MOB_TXN FLG_HAS_NOMINEE FLG_HAS_OLD_LOAN random
## 1 0 1 1 0.000011400
## 2 0 1 0 0.000111373
## 3 0 1 1 0.000119954
## 4 71388 1 0 0.000136825
## 5 0 1 0 0.000173976
## 6 0 1 1 0.000405840
# High-level summary of every column in the dataset: min/quartiles/mean/max
# for numeric columns and per-level counts for factor columns.
summary(raw_data)
## CUST_ID TARGET AGE GENDER
## C1 : 1 Min. :0.0000 Min. :21.00 F: 5433
## C10 : 1 1st Qu.:0.0000 1st Qu.:30.00 M:14376
## C100 : 1 Median :0.0000 Median :38.00 O: 191
## C1000 : 1 Mean :0.1256 Mean :38.42
## C10000 : 1 3rd Qu.:0.0000 3rd Qu.:46.00
## C10001 : 1 Max. :1.0000 Max. :55.00
## (Other):19994
## BALANCE OCCUPATION AGE_BKT SCR
## Min. : 0 PROF :5417 <25 :1753 Min. :100.0
## 1st Qu.: 64754 SAL :5855 26-30:3434 1st Qu.:227.0
## Median : 231676 SELF-EMP:3568 31-35:3404 Median :364.0
## Mean : 511362 SENP :5160 36-40:2814 Mean :440.2
## 3rd Qu.: 653877 41-45:3067 3rd Qu.:644.0
## Max. :8360431 46-50:2493 Max. :999.0
## >50 :3035
## HOLDING_PERIOD ACC_TYPE ACC_OP_DATE LEN_OF_RLTN_IN_MNTH
## Min. : 1.00 CA: 4241 11/16/2010: 24 Min. : 29.0
## 1st Qu.: 7.00 SA:15759 04-03-09 : 23 1st Qu.: 79.0
## Median :15.00 7/25/2010 : 22 Median :125.0
## Mean :14.96 05-06-13 : 21 Mean :125.2
## 3rd Qu.:22.00 02-07-07 : 20 3rd Qu.:172.0
## Max. :31.00 8/24/2010 : 20 Max. :221.0
## (Other) :19870
## NO_OF_L_CR_TXNS NO_OF_L_DR_TXNS TOT_NO_OF_L_TXNS
## Min. : 0.00 Min. : 0.000 Min. : 0.00
## 1st Qu.: 6.00 1st Qu.: 2.000 1st Qu.: 9.00
## Median :10.00 Median : 5.000 Median : 14.00
## Mean :12.35 Mean : 6.634 Mean : 18.98
## 3rd Qu.:14.00 3rd Qu.: 7.000 3rd Qu.: 21.00
## Max. :75.00 Max. :74.000 Max. :149.00
##
## NO_OF_BR_CSH_WDL_DR_TXNS NO_OF_ATM_DR_TXNS NO_OF_NET_DR_TXNS
## Min. : 0.000 Min. : 0.000 Min. : 0.000
## 1st Qu.: 1.000 1st Qu.: 0.000 1st Qu.: 0.000
## Median : 1.000 Median : 1.000 Median : 0.000
## Mean : 1.883 Mean : 1.029 Mean : 1.172
## 3rd Qu.: 2.000 3rd Qu.: 1.000 3rd Qu.: 1.000
## Max. :15.000 Max. :25.000 Max. :22.000
##
## NO_OF_MOB_DR_TXNS NO_OF_CHQ_DR_TXNS FLG_HAS_CC AMT_ATM_DR
## Min. : 0.0000 Min. : 0.000 Min. :0.0000 Min. : 0
## 1st Qu.: 0.0000 1st Qu.: 0.000 1st Qu.:0.0000 1st Qu.: 0
## Median : 0.0000 Median : 2.000 Median :0.0000 Median : 6900
## Mean : 0.4118 Mean : 2.138 Mean :0.3054 Mean : 10990
## 3rd Qu.: 0.0000 3rd Qu.: 4.000 3rd Qu.:1.0000 3rd Qu.: 15800
## Max. :25.0000 Max. :15.000 Max. :1.0000 Max. :199300
##
## AMT_BR_CSH_WDL_DR AMT_CHQ_DR AMT_NET_DR AMT_MOB_DR
## Min. : 0 Min. : 0 Min. : 0 Min. : 0
## 1st Qu.: 2990 1st Qu.: 0 1st Qu.: 0 1st Qu.: 0
## Median :340150 Median : 23840 Median : 0 Median : 0
## Mean :378474 Mean : 124520 Mean :237308 Mean : 22425
## 3rd Qu.:674675 3rd Qu.: 72470 3rd Qu.:473970 3rd Qu.: 0
## Max. :999930 Max. :4928640 Max. :999854 Max. :199667
##
## AMT_L_DR FLG_HAS_ANY_CHGS AMT_OTH_BK_ATM_USG_CHGS
## Min. : 0 Min. :0.0000 Min. : 0.000
## 1st Qu.: 237936 1st Qu.:0.0000 1st Qu.: 0.000
## Median : 695115 Median :0.0000 Median : 0.000
## Mean : 773717 Mean :0.1106 Mean : 1.099
## 3rd Qu.:1078927 3rd Qu.:0.0000 3rd Qu.: 0.000
## Max. :6514921 Max. :1.0000 Max. :250.000
##
## AMT_MIN_BAL_NMC_CHGS NO_OF_IW_CHQ_BNC_TXNS NO_OF_OW_CHQ_BNC_TXNS
## Min. : 0.000 Min. :0.00000 Min. :0.0000
## 1st Qu.: 0.000 1st Qu.:0.00000 1st Qu.:0.0000
## Median : 0.000 Median :0.00000 Median :0.0000
## Mean : 1.292 Mean :0.04275 Mean :0.0444
## 3rd Qu.: 0.000 3rd Qu.:0.00000 3rd Qu.:0.0000
## Max. :170.000 Max. :2.00000 Max. :2.0000
##
## AVG_AMT_PER_ATM_TXN AVG_AMT_PER_CSH_WDL_TXN AVG_AMT_PER_CHQ_TXN
## Min. : 0 Min. : 0 Min. : 0
## 1st Qu.: 0 1st Qu.: 1266 1st Qu.: 0
## Median : 6000 Median :147095 Median : 8645
## Mean : 7409 Mean :242236 Mean : 25092
## 3rd Qu.:13500 3rd Qu.:385000 3rd Qu.: 28605
## Max. :25000 Max. :999640 Max. :537842
##
## AVG_AMT_PER_NET_TXN AVG_AMT_PER_MOB_TXN FLG_HAS_NOMINEE FLG_HAS_OLD_LOAN
## Min. : 0 Min. : 0 Min. :0.0000 Min. :0.0000
## 1st Qu.: 0 1st Qu.: 0 1st Qu.:1.0000 1st Qu.:0.0000
## Median : 0 Median : 0 Median :1.0000 Median :0.0000
## Mean :179059 Mean : 20304 Mean :0.9012 Mean :0.4929
## 3rd Qu.:257699 3rd Qu.: 0 3rd Qu.:1.0000 3rd Qu.:1.0000
## Max. :999854 Max. :199667 Max. :1.0000 Max. :1.0000
##
## random
## Min. :0.0000114
## 1st Qu.:0.2481866
## Median :0.5061214
## Mean :0.5019330
## 3rd Qu.:0.7535712
## Max. :0.9999471
##
The dataset looks clean without any missing values.
# Attach DataExplorer for the exploratory report. library() stops with an
# error if the package is not installed; require() would only return FALSE
# (with a warning) and let the script fail later at the first package call.
library(DataExplorer)
## Loading required package: DataExplorer
| Name | Value |
|---|---|
| Rows | 20,000 |
| Columns | 40 |
| Discrete columns | 6 |
| Continuous columns | 34 |
| All missing columns | 0 |
| Missing observations | 0 |
| Complete Rows | 20,000 |
| Total observations | 800,000 |
| Memory allocation | 5.3 Mb |
## 2 columns ignored with more than 50 categories.
## CUST_ID: 20000 categories
## ACC_OP_DATE: 4869 categories
## 2 features with more than 20 categories ignored!
## CUST_ID: 20000 categories
## ACC_OP_DATE: 4869 categories
## 2 features with more than 50 categories ignored!
## CUST_ID: 20000 categories
## ACC_OP_DATE: 4869 categories
# Drop CUST_ID: it is a unique per-customer identifier (one distinct value
# per row), so it carries no information useful for predicting TARGET.
raw_data <- raw_data[, names(raw_data) != "CUST_ID", drop = FALSE]
Check for zero-variance and near-zero-variance predictors
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
# Compute the per-column diagnostics (freqRatio, percentUnique, zeroVar, nzv);
# saveMetrics = TRUE returns the full metrics table for all columns.
nzv <- nearZeroVar(raw_data, saveMetrics=TRUE)
# Copy the row names (the column names of raw_data) into an explicit column.
nzv <-cbind(row_name = rownames(nzv),nzv)
# Result: no column has zero variance (zeroVar is FALSE everywhere), but six
# columns are flagged as near-zero variance (nzv = TRUE): AMT_ATM_DR,
# AMT_OTH_BK_ATM_USG_CHGS, AMT_MIN_BAL_NMC_CHGS, NO_OF_IW_CHQ_BNC_TXNS,
# NO_OF_OW_CHQ_BNC_TXNS and AVG_AMT_PER_ATM_TXN.
print(nzv)
## row_name freqRatio
## TARGET TARGET 6.961783
## AGE AGE 1.073171
## GENDER GENDER 2.646052
## BALANCE BALANCE 2.111111
## OCCUPATION OCCUPATION 1.080857
## AGE_BKT AGE_BKT 1.008813
## SCR SCR 1.044776
## HOLDING_PERIOD HOLDING_PERIOD 1.336165
## ACC_TYPE ACC_TYPE 3.715869
## ACC_OP_DATE ACC_OP_DATE 1.043478
## LEN_OF_RLTN_IN_MNTH LEN_OF_RLTN_IN_MNTH 1.025316
## NO_OF_L_CR_TXNS NO_OF_L_CR_TXNS 1.553664
## NO_OF_L_DR_TXNS NO_OF_L_DR_TXNS 1.109273
## TOT_NO_OF_L_TXNS TOT_NO_OF_L_TXNS 1.189585
## NO_OF_BR_CSH_WDL_DR_TXNS NO_OF_BR_CSH_WDL_DR_TXNS 1.347703
## NO_OF_ATM_DR_TXNS NO_OF_ATM_DR_TXNS 1.573045
## NO_OF_NET_DR_TXNS NO_OF_NET_DR_TXNS 1.635942
## NO_OF_MOB_DR_TXNS NO_OF_MOB_DR_TXNS 4.021603
## NO_OF_CHQ_DR_TXNS NO_OF_CHQ_DR_TXNS 1.682764
## FLG_HAS_CC FLG_HAS_CC 2.274394
## AMT_ATM_DR AMT_ATM_DR 66.094737
## AMT_BR_CSH_WDL_DR AMT_BR_CSH_WDL_DR 331.533333
## AMT_CHQ_DR AMT_CHQ_DR 405.142857
## AMT_NET_DR AMT_NET_DR 813.692308
## AMT_MOB_DR AMT_MOB_DR 1545.100000
## AMT_L_DR AMT_L_DR 57.692308
## FLG_HAS_ANY_CHGS FLG_HAS_ANY_CHGS 8.041591
## AMT_OTH_BK_ATM_USG_CHGS AMT_OTH_BK_ATM_USG_CHGS 451.340909
## AMT_MIN_BAL_NMC_CHGS AMT_MIN_BAL_NMC_CHGS 130.578947
## NO_OF_IW_CHQ_BNC_TXNS NO_OF_IW_CHQ_BNC_TXNS 22.553592
## NO_OF_OW_CHQ_BNC_TXNS NO_OF_OW_CHQ_BNC_TXNS 21.572235
## AVG_AMT_PER_ATM_TXN AVG_AMT_PER_ATM_TXN 62.168317
## AVG_AMT_PER_CSH_WDL_TXN AVG_AMT_PER_CSH_WDL_TXN 452.090909
## AVG_AMT_PER_CHQ_TXN AVG_AMT_PER_CHQ_TXN 425.400000
## AVG_AMT_PER_NET_TXN AVG_AMT_PER_NET_TXN 813.692308
## AVG_AMT_PER_MOB_TXN AVG_AMT_PER_MOB_TXN 1545.100000
## FLG_HAS_NOMINEE FLG_HAS_NOMINEE 9.116338
## FLG_HAS_OLD_LOAN FLG_HAS_OLD_LOAN 1.028603
## random random 1.000000
## percentUnique zeroVar nzv
## TARGET 0.010 FALSE FALSE
## AGE 0.175 FALSE FALSE
## GENDER 0.015 FALSE FALSE
## BALANCE 49.620 FALSE FALSE
## OCCUPATION 0.020 FALSE FALSE
## AGE_BKT 0.035 FALSE FALSE
## SCR 4.480 FALSE FALSE
## HOLDING_PERIOD 0.155 FALSE FALSE
## ACC_TYPE 0.010 FALSE FALSE
## ACC_OP_DATE 24.345 FALSE FALSE
## LEN_OF_RLTN_IN_MNTH 0.965 FALSE FALSE
## NO_OF_L_CR_TXNS 0.380 FALSE FALSE
## NO_OF_L_DR_TXNS 0.240 FALSE FALSE
## TOT_NO_OF_L_TXNS 0.490 FALSE FALSE
## NO_OF_BR_CSH_WDL_DR_TXNS 0.080 FALSE FALSE
## NO_OF_ATM_DR_TXNS 0.130 FALSE FALSE
## NO_OF_NET_DR_TXNS 0.085 FALSE FALSE
## NO_OF_MOB_DR_TXNS 0.030 FALSE FALSE
## NO_OF_CHQ_DR_TXNS 0.080 FALSE FALSE
## FLG_HAS_CC 0.010 FALSE FALSE
## AMT_ATM_DR 3.230 FALSE TRUE
## AMT_BR_CSH_WDL_DR 36.845 FALSE FALSE
## AMT_CHQ_DR 24.455 FALSE FALSE
## AMT_NET_DR 24.235 FALSE FALSE
## AMT_MOB_DR 11.560 FALSE FALSE
## AMT_L_DR 47.980 FALSE FALSE
## FLG_HAS_ANY_CHGS 0.010 FALSE FALSE
## AMT_OTH_BK_ATM_USG_CHGS 0.030 FALSE TRUE
## AMT_MIN_BAL_NMC_CHGS 0.010 FALSE TRUE
## NO_OF_IW_CHQ_BNC_TXNS 0.015 FALSE TRUE
## NO_OF_OW_CHQ_BNC_TXNS 0.015 FALSE TRUE
## AVG_AMT_PER_ATM_TXN 3.760 FALSE TRUE
## AVG_AMT_PER_CSH_WDL_TXN 37.570 FALSE FALSE
## AVG_AMT_PER_CHQ_TXN 26.730 FALSE FALSE
## AVG_AMT_PER_NET_TXN 24.285 FALSE FALSE
## AVG_AMT_PER_MOB_TXN 11.580 FALSE FALSE
## FLG_HAS_NOMINEE 0.010 FALSE FALSE
## FLG_HAS_OLD_LOAN 0.010 FALSE FALSE
## random 100.000 FALSE FALSE